This project is on Home Mortgage Disclosure Act Data(HMDA), which requires many financial institutions to maintain, report, and publicly disclose information about mortgages. This dataset covers mortgage decisions for first-lien, owner-occupied, 1-4 family house made in 2016 for the state of Florida. # Data wrangling and exploratory data analysis ## Download and Preprocess data The data of is downloaded from https://github.com/fengqifang/HMDA/raw/master/hmda_lar_FL_2016.zip. Descriptions for each column can be found in https://cfpb.github.io/api/hmda/fields.html.
library(data.table)
library(dplyr)
library(ggplot2)
library(kableExtra)
library(dygraphs)
library(knitr)
library(plotly)
library(stringr)
library(lubridate)
library(Matrix)
dat <- fread("hmda_lar_FL_2016.csv",na.strings = c("",NA))
##
Read 57.8% of 415338 rows
Read 415338 rows and 36 (of 36) columns from 0.237 GB file in 00:00:03
head(dat,n =3)
## tract_to_msamd_income rate_spread population minority_population
## 1: 109.08000183105469 NA 5310 23.6299991607666
## 2: 110.05999755859375 NA 2428 10.789999961853027
## 3: 148.5800018310547 NA 3943 85.47000122070312
## number_of_owner_occupied_units number_of_1_to_4_family_units
## 1: 1486 2252
## 2: 819 1465
## 3: 1041 925
## loan_amount_000s hud_median_family_income applicant_income_000s
## 1: 205 59600 NA
## 2: 156 51700 42
## 3: 143 48100 43
## state_name state_abbr sequence_number respondent_id
## 1: Florida FL 0026556 0000617677
## 2: Florida FL 0287915 22-3039688
## 3: Florida FL 0079973 38-2750395
## purchaser_type_name
## 1: Loan was not originated or was not sold in calendar year covered by register
## 2: Ginnie Mae (GNMA)
## 3: Freddie Mac (FHLMC)
## property_type_name
## 1: One-to-four family dwelling (other than manufactured housing)
## 2: One-to-four family dwelling (other than manufactured housing)
## 3: One-to-four family dwelling (other than manufactured housing)
## preapproval_name owner_occupancy_name
## 1: Preapproval was requested Owner-occupied as a principal dwelling
## 2: Not applicable Owner-occupied as a principal dwelling
## 3: Not applicable Owner-occupied as a principal dwelling
## msamd_name loan_type_name
## 1: Pensacola, Ferry Pass, Brent - FL FHA-insured
## 2: Deltona, Daytona Beach, Ormond Beach - FL FHA-insured
## 3: Miami, Miami Beach, Kendall - FL Conventional
## loan_purpose_name lien_status_name hoepa_status_name
## 1: Home purchase Secured by a first lien Not a HOEPA loan
## 2: Home purchase Secured by a first lien Not a HOEPA loan
## 3: Home purchase Secured by a first lien Not a HOEPA loan
## edit_status_name denial_reason_name_1 county_name
## 1: Quality edit failure only NA Escambia County
## 2: NA NA Volusia County
## 3: NA NA Miami-Dade County
## co_applicant_sex_name co_applicant_race_name_1
## 1: Female White
## 2: No co-applicant No co-applicant
## 3: No co-applicant No co-applicant
## co_applicant_ethnicity_name census_tract_number as_of_year
## 1: Not Hispanic or Latino 0036.14 2016
## 2: No co-applicant 0805.00 2016
## 3: No co-applicant 0168.00 2016
## application_date_indicator applicant_sex_name applicant_race_name_1
## 1: 0 Male White
## 2: 0 Male White
## 3: 0 Male White
## applicant_ethnicity_name agency_name
## 1: Not Hispanic or Latino Consumer Financial Protection Bureau
## 2: Not Hispanic or Latino Consumer Financial Protection Bureau
## 3: Hispanic or Latino Department of Housing and Urban Development
## agency_abbr
## 1: CFPB
## 2: CFPB
## 3: HUD
There are 36 columns and 415338 observations in the raw data. let’s look at the data type and missing data rate for each column.
dat %>% sapply(., function(x)sum(is.na(x))/length(x)) %>%
kable("html",col.names = "Missing.Rate") %>%
kable_styling(bootstrap_options = c("striped", "hover"))
| Missing.Rate | |
|---|---|
| tract_to_msamd_income | 0.0010714 |
| rate_spread | 0.9247890 |
| population | 0.0008957 |
| minority_population | 0.0008957 |
| number_of_owner_occupied_units | 0.0009462 |
| number_of_1_to_4_family_units | 0.0014157 |
| loan_amount_000s | 0.0000000 |
| hud_median_family_income | 0.0008788 |
| applicant_income_000s | 0.0754687 |
| state_name | 0.0000000 |
| state_abbr | 0.0000000 |
| sequence_number | 0.0000000 |
| respondent_id | 0.0000000 |
| purchaser_type_name | 0.0000000 |
| property_type_name | 0.0000000 |
| preapproval_name | 0.0000000 |
| owner_occupancy_name | 0.0000000 |
| msamd_name | 0.0142727 |
| loan_type_name | 0.0000000 |
| loan_purpose_name | 0.0000000 |
| lien_status_name | 0.0000000 |
| hoepa_status_name | 0.0000000 |
| edit_status_name | 0.8339184 |
| denial_reason_name_1 | 1.0000000 |
| county_name | 0.0003900 |
| co_applicant_sex_name | 0.0000000 |
| co_applicant_race_name_1 | 0.0000000 |
| co_applicant_ethnicity_name | 0.0000000 |
| census_tract_number | 0.0008788 |
| as_of_year | 0.0000000 |
| application_date_indicator | 0.0000000 |
| applicant_sex_name | 0.0000000 |
| applicant_race_name_1 | 0.0000000 |
| applicant_ethnicity_name | 0.0000000 |
| agency_name | 0.0000000 |
| agency_abbr | 0.0000000 |
Let’s examining the varibles one by one. The first 9 columns are continous number, and it needs to be converted numeric first. And for other column, we can convert them into factor variables for now.
names(dat)
## [1] "tract_to_msamd_income" "rate_spread"
## [3] "population" "minority_population"
## [5] "number_of_owner_occupied_units" "number_of_1_to_4_family_units"
## [7] "loan_amount_000s" "hud_median_family_income"
## [9] "applicant_income_000s" "state_name"
## [11] "state_abbr" "sequence_number"
## [13] "respondent_id" "purchaser_type_name"
## [15] "property_type_name" "preapproval_name"
## [17] "owner_occupancy_name" "msamd_name"
## [19] "loan_type_name" "loan_purpose_name"
## [21] "lien_status_name" "hoepa_status_name"
## [23] "edit_status_name" "denial_reason_name_1"
## [25] "county_name" "co_applicant_sex_name"
## [27] "co_applicant_race_name_1" "co_applicant_ethnicity_name"
## [29] "census_tract_number" "as_of_year"
## [31] "application_date_indicator" "applicant_sex_name"
## [33] "applicant_race_name_1" "applicant_ethnicity_name"
## [35] "agency_name" "agency_abbr"
dat <- dat %>% mutate_at(vars(1:9),funs(as.numeric(.)))
dat <- as.data.frame(unclass(dat))
str(dat)
## 'data.frame': 415338 obs. of 36 variables:
## $ tract_to_msamd_income : num 109 110 149 157 171 ...
## $ rate_spread : num NA NA NA NA NA NA 5.4 NA NA NA ...
## $ population : num 5310 2428 3943 6252 22670 ...
## $ minority_population : num 23.6 10.8 85.5 93 41 ...
## $ number_of_owner_occupied_units: num 1486 819 1041 1380 6133 ...
## $ number_of_1_to_4_family_units : num 2252 1465 925 1611 7918 ...
## $ loan_amount_000s : num 205 156 143 340 258 311 207 66 193 95 ...
## $ hud_median_family_income : num 59600 51700 48100 48100 57800 59600 59200 48100 48100 57800 ...
## $ applicant_income_000s : num NA 42 43 138 72 67 92 30 50 50 ...
## $ state_name : Factor w/ 1 level "Florida": 1 1 1 1 1 1 1 1 1 1 ...
## $ state_abbr : Factor w/ 1 level "FL": 1 1 1 1 1 1 1 1 1 1 ...
## $ sequence_number : Factor w/ 160472 levels "0000001","0000002",..: 26324 119968 64722 72968 294 91 51297 26667 279 122902 ...
## $ respondent_id : Factor w/ 1347 levels "0000000053","0000000058",..: 719 1001 1114 1243 1077 490 923 1136 1227 1001 ...
## $ purchaser_type_name : Factor w/ 10 levels "Affiliate institution",..: 8 6 5 3 6 7 10 3 2 3 ...
## $ property_type_name : Factor w/ 1 level "One-to-four family dwelling (other than manufactured housing)": 1 1 1 1 1 1 1 1 1 1 ...
## $ preapproval_name : Factor w/ 3 levels "Not applicable",..: 3 1 1 1 1 1 1 1 1 1 ...
## $ owner_occupancy_name : Factor w/ 1 level "Owner-occupied as a principal dwelling": 1 1 1 1 1 1 1 1 1 1 ...
## $ msamd_name : Factor w/ 24 levels "Cape Coral, Fort Myers - FL",..: 16 3 9 9 13 16 22 9 9 13 ...
## $ loan_type_name : Factor w/ 4 levels "Conventional",..: 2 2 1 1 2 1 1 1 1 1 ...
## $ loan_purpose_name : Factor w/ 3 levels "Home improvement",..: 2 2 2 3 2 3 2 3 3 3 ...
## $ lien_status_name : Factor w/ 1 level "Secured by a first lien": 1 1 1 1 1 1 1 1 1 1 ...
## $ hoepa_status_name : Factor w/ 2 levels "HOEPA loan","Not a HOEPA loan": 2 2 2 2 2 2 2 2 2 2 ...
## $ edit_status_name : Factor w/ 1 level "Quality edit failure only": 1 NA NA NA NA NA NA NA NA NA ...
## $ denial_reason_name_1 : Factor w/ 0 levels: NA NA NA NA NA NA NA NA NA NA ...
## $ county_name : Factor w/ 67 levels "Alachua County",..: 16 64 43 43 48 16 52 43 43 57 ...
## $ co_applicant_sex_name : Factor w/ 5 levels "Female","Information not provided by applicant in mail, Internet, or telephone application",..: 1 4 4 4 4 4 4 1 4 3 ...
## $ co_applicant_race_name_1 : Factor w/ 8 levels "American Indian or Alaska Native",..: 8 6 6 6 6 6 6 8 6 8 ...
## $ co_applicant_ethnicity_name : Factor w/ 5 levels "Hispanic or Latino",..: 5 3 3 3 3 3 3 1 3 5 ...
## $ census_tract_number : Factor w/ 2962 levels "0001.00","0001.01",..: 369 2554 1460 1413 1489 133 1845 649 1530 1699 ...
## $ as_of_year : Factor w/ 1 level "2016": 1 1 1 1 1 1 1 1 1 1 ...
## $ application_date_indicator : Factor w/ 1 level "0": 1 1 1 1 1 1 1 1 1 1 ...
## $ applicant_sex_name : Factor w/ 4 levels "Female","Information not provided by applicant in mail, Internet, or telephone application",..: 3 3 3 3 1 3 3 3 1 1 ...
## $ applicant_race_name_1 : Factor w/ 7 levels "American Indian or Alaska Native",..: 7 7 7 7 3 4 7 2 7 7 ...
## $ applicant_ethnicity_name : Factor w/ 4 levels "Hispanic or Latino",..: 4 4 1 4 4 4 4 4 1 4 ...
## $ agency_name : Factor w/ 6 levels "Consumer Financial Protection Bureau",..: 1 1 2 2 2 3 2 2 2 1 ...
## $ agency_abbr : Factor w/ 6 levels "CFPB","FDIC",..: 1 1 4 4 4 2 4 4 4 1 ...
tract_to_msamd_income It is the percentage of the median family income for the tract compared to the median family income for the MSA/MD, rounded to two decimal places.
p <-ggplot(dat, aes(x = "", y = tract_to_msamd_income,fill="tract_to_msamd_income[%]")) + geom_boxplot()
p <- ggplotly(p)
## Warning: Removed 445 rows containing non-finite values (stat_boxplot).
p
There are 445 missing values and the values ranging from 11 to 500.
rate_spread Rate spread for the loan is the difference between the loan’s annual percentage rate (APR) and the average prime offer rate (APOR). This value is after the loan approval, which alos can not be used in the prediction. It can be dropped directly. s
dat <- data.table(dat)[, rate_spread:= NULL]
population and minority_population There are 372 missing values in both variables.
p <- ggplot(dat,aes(population,fill = "population"))+geom_histogram(fill = "red")
p1 <- ggplotly(p)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 372 rows containing non-finite values (stat_bin).
p <- ggplot(dat,aes(minority_population,fill = "minority_population[%]"))+
geom_histogram(fill = "blue")
p2 <- ggplotly(p)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 372 rows containing non-finite values (stat_bin).
p <- subplot(p1,p2)
p
number_of_owner_occupied_units and number_of_1_to_4_family_units
p <- ggplot(dat,aes(number_of_owner_occupied_units,fill = "owener_occ"))+geom_histogram(fill = "red")
p1 <- ggplotly(p)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 393 rows containing non-finite values (stat_bin).
p <- ggplot(dat,aes(number_of_1_to_4_family_units,fill = "1 to 4 family units"))+
geom_histogram(fill = "blue")
p2 <- ggplotly(p)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 588 rows containing non-finite values (stat_bin).
p <- subplot(p1,p2)
p
Impute the missing value with median.
dat <- dat %>% mutate_at(vars(number_of_owner_occupied_units,number_of_1_to_4_family_units),funs(ifelse(is.na(.),median(.),.)))
loan_amount_000s,hud_median_family_income,applicant_income_000s
summary(dat$loan_amount_000s)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 138.0 194.0 226.9 270.0 20800.0
summary(dat$loan_amount_000s)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 138.0 194.0 226.9 270.0 20800.0
These two variables have very long tails. We will need to transform it first and then plot
p <- ggplot(dat, aes(x="Loan amount",y = log(loan_amount_000s),fill = "loan_amount_000s")) + geom_boxplot(fill ="red")
p1 <- ggplotly(p)
p <- ggplot(dat, aes(x ="median family income",y = log(hud_median_family_income/1000),fill = "median_family_income_000s")) + geom_boxplot(fill ="blue")
p2 <- ggplotly(p)
## Warning: Removed 365 rows containing non-finite values (stat_boxplot).
p <- ggplot(dat, aes(x = "applicant income",y=log(applicant_income_000s),fill = "applicant_income_000s")) + geom_boxplot(fill = "green")
p3 <- ggplotly(p)
## Warning: Removed 31345 rows containing non-finite values (stat_boxplot).
subplot(p1,p2,p3)